{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "# 其他分析    \n",
    "1. 标签关联：统计高频标签组合（词云图）\n",
    "2. 字数-评分：散点图观察相关性"
   ],
   "id": "d855c3ed41380afc"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:35:58.648987Z",
     "start_time": "2025-02-26T00:35:58.299302Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from pymongo import MongoClient\n",
    "# 连接到MongoDB\n",
    "client = MongoClient('mongodb://localhost:27017/')\n",
    "# 打开learn数据库\n",
    "db = client['learn']\n",
    "# 在数据库中打开seven_cats集合\n",
    "books = db['seven_cats']"
   ],
   "id": "d1530e8a352fde73",
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:36:08.550796Z",
     "start_time": "2025-02-26T00:36:07.979331Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "# 将MongoDB数据转换为pandas数据\n",
    "add = pd.DataFrame(list(books.find({}, {'_id': 0, 'label': 1, 'wordcount': 1, 'score': 1})))\n",
    "add['count'] = 1\n",
    "add"
   ],
   "id": "b933de6ec89b6070",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "      score label  wordcount  count\n",
       "0       9.3  热血校园     272.47      1\n",
       "1       9.1  古典仙侠      38.60      1\n",
       "2       8.8  东方玄幻      61.96      1\n",
       "3       9.3  异世大陆     664.36      1\n",
       "4       9.1  东方玄幻     552.87      1\n",
       "...     ...   ...        ...    ...\n",
       "7713    9.3  星际机甲     329.66      1\n",
       "7714    8.9  末世危机     424.60      1\n",
       "7715    9.1  都市高武     135.02      1\n",
       "7716    9.0  衍生同人      82.74      1\n",
       "7717    9.0  衍生同人      85.03      1\n",
       "\n",
       "[7718 rows x 4 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>score</th>\n",
       "      <th>label</th>\n",
       "      <th>wordcount</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9.3</td>\n",
       "      <td>热血校园</td>\n",
       "      <td>272.47</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9.1</td>\n",
       "      <td>古典仙侠</td>\n",
       "      <td>38.60</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8.8</td>\n",
       "      <td>东方玄幻</td>\n",
       "      <td>61.96</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9.3</td>\n",
       "      <td>异世大陆</td>\n",
       "      <td>664.36</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9.1</td>\n",
       "      <td>东方玄幻</td>\n",
       "      <td>552.87</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7713</th>\n",
       "      <td>9.3</td>\n",
       "      <td>星际机甲</td>\n",
       "      <td>329.66</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7714</th>\n",
       "      <td>8.9</td>\n",
       "      <td>末世危机</td>\n",
       "      <td>424.60</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7715</th>\n",
       "      <td>9.1</td>\n",
       "      <td>都市高武</td>\n",
       "      <td>135.02</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7716</th>\n",
       "      <td>9.0</td>\n",
       "      <td>衍生同人</td>\n",
       "      <td>82.74</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7717</th>\n",
       "      <td>9.0</td>\n",
       "      <td>衍生同人</td>\n",
       "      <td>85.03</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7718 rows × 4 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:36:08.638658Z",
     "start_time": "2025-02-26T00:36:08.551807Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 导入可视化工具\n",
    "from pyecharts import options as opts"
   ],
   "id": "850e902a1b69f417",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 一、标签关联：统计高频标签组合",
   "id": "7ef3c43f6fff0cde"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:36:10.219749Z",
     "start_time": "2025-02-26T00:36:10.209554Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 分组聚合\n",
    "agg = pd.concat([add['label'], add['count']], axis=1)\n",
    "grouped = agg.groupby('label')['count'].sum()\n",
    "grouped"
   ],
   "id": "a047e4a585ec19fb",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "label\n",
       "上古洪荒     117\n",
       "东方玄幻    1228\n",
       "乡村生活      82\n",
       "传统武侠      16\n",
       "侦探推理      31\n",
       "其他竞技       4\n",
       "历史传记       3\n",
       "原生幻想      16\n",
       "古典仙侠     129\n",
       "商战职场     189\n",
       "奇门秘术      73\n",
       "官场         9\n",
       "寻宝探险      31\n",
       "幻想修真     194\n",
       "异世大陆     456\n",
       "恐怖灵异     102\n",
       "战争幻想       4\n",
       "抗战烽火       7\n",
       "明星娱乐     112\n",
       "星际机甲      17\n",
       "未来幻想      71\n",
       "末世危机     248\n",
       "架空历史     698\n",
       "武侠幻想      44\n",
       "灵气复苏      63\n",
       "热血校园     107\n",
       "王朝争霸      23\n",
       "现代军事      24\n",
       "现实百态      23\n",
       "电子竞技      28\n",
       "穿越历史     342\n",
       "篮球风云      46\n",
       "虚拟网游     107\n",
       "衍生同人     416\n",
       "西方奇幻      20\n",
       "诸天万界      85\n",
       "谍战特工       2\n",
       "足球天下      25\n",
       "轻小说        6\n",
       "都市生活     391\n",
       "都市高手     960\n",
       "都市高武    1169\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:36:10.963593Z",
     "start_time": "2025-02-26T00:36:10.959449Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 构建词云图\n",
    "from pyecharts.charts import WordCloud"
   ],
   "id": "141edb4032b5a4c",
   "outputs": [],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:37:20.529690Z",
     "start_time": "2025-02-26T00:37:20.514628Z"
    }
   },
   "cell_type": "code",
   "source": [
    "word = WordCloud()\n",
    "word.add(\"\", [list(z) for z in zip(grouped.index, grouped)], word_size_range=[12, 66])\n",
    "word.set_global_opts(\n",
    "        title_opts=opts.TitleOpts(\n",
    "            title=\"高频标签组合\", title_textstyle_opts=opts.TextStyleOpts(font_size=23), \n",
    "            pos_left=\"center\", pos_bottom=\"0%\"\n",
    "        )\n",
    "    )\n",
    "word.render(\"D:\\sevencats-novels-analysis\\images\\七猫高频标签组合.html\")"
   ],
   "id": "5a549db669d30c7b",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'D:\\\\sevencats-novels-analysis\\\\images\\\\七猫高频标签组合.html'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 二、字数-评分：散点图观察相关性",
   "id": "aed599be60b3d19e"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:37:43.820902Z",
     "start_time": "2025-02-26T00:37:43.800314Z"
    }
   },
   "cell_type": "code",
   "source": [
    "agg = pd.concat([add['wordcount'], add['score']], axis=1)\n",
    "sort = agg.sort_values('wordcount')\n",
    "sort"
   ],
   "id": "fc3ef7ba74470d92",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "      wordcount  score\n",
       "4099       0.68    8.0\n",
       "2379       2.00    9.2\n",
       "3735       2.31    8.0\n",
       "3778       2.97    8.0\n",
       "2916       3.01    8.0\n",
       "...         ...    ...\n",
       "4785    2450.26    9.3\n",
       "5928    2477.50    8.8\n",
       "4969    2590.46    9.1\n",
       "7520    2829.74    8.9\n",
       "7344    5103.03    8.8\n",
       "\n",
       "[7718 rows x 2 columns]"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>wordcount</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4099</th>\n",
       "      <td>0.68</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2379</th>\n",
       "      <td>2.00</td>\n",
       "      <td>9.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3735</th>\n",
       "      <td>2.31</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3778</th>\n",
       "      <td>2.97</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2916</th>\n",
       "      <td>3.01</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4785</th>\n",
       "      <td>2450.26</td>\n",
       "      <td>9.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5928</th>\n",
       "      <td>2477.50</td>\n",
       "      <td>8.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4969</th>\n",
       "      <td>2590.46</td>\n",
       "      <td>9.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7520</th>\n",
       "      <td>2829.74</td>\n",
       "      <td>8.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7344</th>\n",
       "      <td>5103.03</td>\n",
       "      <td>8.8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7718 rows × 2 columns</p>\n",
       "</div>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 10
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:37:45.224587Z",
     "start_time": "2025-02-26T00:37:45.220124Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# 构建散点图\n",
    "from pyecharts.charts import Scatter"
   ],
   "id": "88557ad3b0cebcdf",
   "outputs": [],
   "execution_count": 11
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-02-26T00:37:46.255747Z",
     "start_time": "2025-02-26T00:37:46.190825Z"
    }
   },
   "cell_type": "code",
   "source": [
    "st = Scatter(init_opts=opts.InitOpts(width=\"1440px\", height=\"480px\"))\n",
    "st.add_xaxis(sort['wordcount'].tolist())\n",
    "st.add_yaxis(\"\", sort['score'].tolist(), label_opts=opts.LabelOpts(is_show=False))\n",
    "st.set_global_opts(\n",
    "    title_opts=opts.TitleOpts(title=\"字数与评分的相关性\", pos_left=\"center\", pos_bottom=\"2%\"),\n",
    "    xaxis_opts=opts.AxisOpts(name=\"字数（万）\", type_=\"value\", max_=5200), \n",
    "    yaxis_opts=opts.AxisOpts(name=\"评分\", type_=\"value\", min_=6, max_=10),\n",
    ")\n",
    "st.render(\"D:\\sevencats-novels-analysis\\images\\七猫字数与评分的相关性.html\")"
   ],
   "id": "1a47fa23dce0671e",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'D:\\\\sevencats-novels-analysis\\\\images\\\\七猫字数与评分的相关性.html'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 12
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
